
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import os
import pandas as pd
import numpy as np
from C_PatentVariables import collecting_patstat, family_aggregation, firm_year_aggregation


def _write_family_counts(df_families, namefile, origin):
    """Count families per year and write the result as a time-series CSV.

    The frame is handed to ``firm_year_aggregation.counting_families``; the
    result has its index reset, ``year_agg`` renamed back to
    ``earliest_filing_year`` and the dummy ``firm_agg_id`` column dropped
    before being saved under ``Data_outputted/C_PatentVariables``.

    Args:
        df_families: DataFrame with the columns ``firm_agg_id``,
            ``docdb_family_id`` and ``year_agg``.
        namefile: Name of the technology selection; forwarded to
            ``counting_families`` and embedded in the output file name.
        origin: Suffix identifying which subset of families was counted
            (e.g. ``'all'`` or ``'OEM'``); embedded in the output file name.
    """
    counts = firm_year_aggregation.counting_families(df_families, namefile, 'Count')
    counts = (
        counts.reset_index()
        .rename(columns={'year_agg': 'earliest_filing_year'})
        .drop(columns='firm_agg_id')
    )
    filename = f'TimeSeries_FamilyCounts_in_{namefile}_from_{origin}.csv'
    counts.to_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/{filename}', index=False)
    # Log the name actually written, so the log can be matched against the output folder.
    LOGGER.info('%s Saved', filename)


def main():
    """Aggregate patent families of ``ipc_cpc_transpo`` at the year level.

    Reads ``Families_of_ipc_cpc_transpo.csv`` and writes one yearly family
    count CSV for all families, then one for each of the ``OEM``,
    ``Subsidiaries`` and ``Suppliers`` subsets flagged in
    ``Families_overlap.csv``, and finally one for the union of ``OEM`` and
    ``Subsidiaries``.
    """
    LOGGER.info('SCRIPT: g_aggregating_families_yearlevel.py')
    LOGGER.info("       AGGREGATION YEAR LEVEL")
    namefile = 'ipc_cpc_transpo'
    df_agg_corres = pd.read_csv(
        PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_{namefile}.csv'
    )
    # Trick to reuse the firm-year aggregation code: pretend every family
    # belongs to one single firm, so the firm-year counts become yearly counts.
    df_agg_corres['firm_agg_id'] = 1
    df_year_docdbid = (
        df_agg_corres[['firm_agg_id', 'docdb_family_id', 'earliest_filing_year']]
        .drop_duplicates()
        .rename(columns={'earliest_filing_year': 'year_agg'})
    )
    _write_family_counts(df_year_docdbid, namefile, 'all')

    # Similar counts, restricted to families originating from particular
    # kinds of firms: OEMs, subsidiaries, suppliers.
    # NOTE(review): the firm-type columns are used directly as boolean masks,
    # so they are assumed to be read back from the CSV as booleans - confirm.
    df_overlap = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_overlap.csv')
    for firmtype in ['OEM', 'Subsidiaries', 'Suppliers']:
        family_ids = df_overlap.loc[df_overlap[firmtype], 'docdb_family_id']
        in_subset = df_year_docdbid['docdb_family_id'].isin(family_ids)
        _write_family_counts(df_year_docdbid[in_subset], namefile, firmtype)

    # Families that come from either OEMs or subsidiaries.
    oem_or_subsidiary = df_overlap['OEM'] | df_overlap['Subsidiaries']
    family_ids = df_overlap.loc[oem_or_subsidiary, 'docdb_family_id']
    in_subset = df_year_docdbid['docdb_family_id'].isin(family_ids)
    _write_family_counts(df_year_docdbid[in_subset], namefile, 'OEM_and_Subsidiaries')

    LOGGER.info('SCRIPT END: g_aggregating_families_yearlevel.py')

